In [2]:
%matplotlib qt4
from __future__ import division

from collections import defaultdict

from models import tools, optimize, models, filters
from models.tests import PerformanceTest

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

# Global plot styling: seaborn "ticks" theme with a framed legend, plus a
# three-colour line cycle (blue, orange, green).
sns.set_style("ticks", {"legend.frameon": True})
# NOTE(review): 'axes.color_cycle' is the older matplotlib rcParams key; newer
# releases use 'axes.prop_cycle' -- confirm against the installed version.
mpl.rcParams['axes.color_cycle'] = ['#02A5F4', 'orange', 'green']

The Naive Way


In [3]:
# Load the slice of answers used by the naive-descent experiments below.
# `limit` and `offset` are forwarded to tools.load_data as-is.
data = tools.load_data(limit=40000, offset=2400000)


Loaded 29981 answers.

In [4]:
# Naive descent optimiser bound to the data loaded above.
grad = optimize.NaiveDescent(data)

In [5]:
# First PFAE search. The two positional arguments are presumably the initial
# gamma and delta (the search prints "gamma"/"delta" on completion) -- TODO confirm.
descent1 = grad.search_pfae(1.5, -2, step_size=3, maxiter=100, precision=0.005)


gamma: 3.49875484931; grad: -0.000740445772384
delta: 0.0056147289931; grad: -0.00073012553616

In [6]:
# Second PFAE search from a different starting point and with a smaller step size.
descent2 = grad.search_pfae(5, 0.5, step_size=2.5, maxiter=100, precision=0.005)


gamma: 4.1518680117; grad: 0.000929034961087
delta: -0.337658559106; grad: 0.000915898007934

In [7]:
# Same starting point as descent1, but using search_pfag with a much larger
# step size and a lower iteration cap.
descent3 = grad.search_pfag(1.5, -2, step_size=20, maxiter=36, precision=0.005)


gamma: 2.03872517589; grad: 0.000378512634818
delta: -1.38733242925; grad: 0.000385493414635

In [8]:
# Evaluate a PFAExt model (built on top of an Elo model) on the loaded data.
# NOTE(review): the gamma/delta values below do not match any of the search
# outputs recorded above -- confirm where they come from.
elo = models.EloModel()
pfae = models.PFAExt(elo, gamma=2.99622612646, delta=-0.476090204636)
pfae_test = PerformanceTest(pfae, data)
pfae_test.run()

In [9]:
# Show the metrics stored under the 'train' key of the test results.
pfae_test.results['train']


Out[9]:
RMSE: 0.333701136179
AUC: 0.835577838236
LL: -10914.7831739
OFF: -0.0153797301698
CORRECT: 25430
ACCURACY: 0.848203862446
Set Size: 29981

In [64]:
plt.figure(num=None, figsize=(5, 4.3), dpi=160)


def annotate(descent, number, mark, color, xadd, yadd):
    """Mark one iteration on the current plot and label it with its parameters.

    Parameters:
        descent: search result exposing `params` and `grads`, both indexable
            with `.loc[number]`; the `params` row must have `gamma` and `delta`.
        number: iteration index to annotate (used as the x coordinate).
        mark: matplotlib marker format string for the highlighted point.
        color: colour of the marker and of the label's box edge.
        xadd, yadd: offsets (in data coordinates) of the label relative to
            the annotated point.
    """
    row = descent.params.loc[number]
    # Named `grad_value` so it does not shadow the notebook-level `grad` optimiser.
    grad_value = descent.grads.loc[number]
    label = r'$\gamma={}$, $\delta={}$'.format(round(row.gamma, 2), round(row.delta, 2))
    plt.annotate(label,
                 xy=(number, grad_value), xycoords='data',
                 xytext=(number + xadd, grad_value + yadd), textcoords='data',
                 bbox=dict(boxstyle="round", fc="w", linewidth=1, edgecolor=color))
    plt.plot(number, grad_value, mark, color=color, markeredgewidth=0, markersize=10)


# Highlight one iteration of each search with its gamma/delta values.
annotate(descent1, 34, 'o', '#02A5F4', -15, -0.015)
annotate(descent3, 12, 'o', 'orange', 0.8, 0.009)

plt.xlabel('Number of iteration')
plt.ylabel(r'$\frac{1}{n}\sum(p_i - y_i)$')

plt.xlim([0, 35])
plt.ylim([-0.08, 0.04])

line1, = plt.plot(descent1.grads[:35], label=r'step = $3$', linewidth=2)
line2, = plt.plot(descent3.grads[:36], label=r'step = $20$', linewidth=2)

plt.legend(handles=[line1, line2], loc='lower right')
plt.tick_params(axis='both', which='major', labelsize=9)

# Finalise the layout before showing: with a blocking backend, anything after
# show() would be applied too late to affect the displayed figure.
plt.tight_layout()
plt.show()

The Proper Way


In [5]:
# Development aid: re-import models.filters so that edits to the module are
# picked up without restarting the kernel.
# NOTE(review): `reload` is a builtin only on Python 2.
reload(filters)


Out[5]:
<module 'models.filters' from '/home/pavel/Projects/thesis/models/filters.py'>

In [7]:
# Rebinds `data` to a different, smaller slice of answers for the gradient-descent
# experiments; the frame loaded for the naive experiments is no longer reachable.
data = tools.load_data(limit=10000, offset=90000)


Loaded 9593 answers.

In [4]:
# Rows selected by the classmates filter.
# NOTE(review): the recorded output (117191 rows) exceeds the 9593 answers
# loaded in the preceding cell, so this cell was last run against a different
# `data` -- re-run the notebook top to bottom to refresh it.
data1 = data[filters.classmates(data)]
# Parenthesised form prints identically on Python 2 and is valid on Python 3.
print(len(data1))


117191

In [5]:
# Complement of the classmates filter: every row it rejects.
# NOTE(review): the recorded output (854837 rows) exceeds the 9593 answers
# loaded above, so this cell was last run against a different `data`.
data2 = data[~filters.classmates(data)]
# Parenthesised form prints identically on Python 2 and is valid on Python 3.
print(len(data2))


854837

In [32]:
# One optimiser per data subset, each paired with the extra keyword arguments
# (none at the moment) to pass to its staircase search.
descents = dict(
    (label, (optimize.GradientDescent(subset), {}))
    for label, subset in (('In-School', data1), ('Out-of-School', data2))
)

In [33]:
# Run the staircase search for every configured optimiser, keyed by its label.
dresults = {}
for name in descents:
    descent, kwargs = descents[name]
    tools.echo(name, clear=False)
    dresults[name] = descent.search_staircase(init_learn_rate=0.015,
                                              number_of_iter=20,
                                              **kwargs)


Spaced Presentations
   2.50000    0.80000        inf
   2.63886    1.22040    0.00072
   2.39013    1.30804    0.00048
   2.25814    1.35747    0.00043
   2.17414    1.35478    0.00042
   2.12339    1.32521    0.00041
   2.09250    1.28376    0.00040
   2.07337    1.23897    0.00039
   2.06142    1.19527    0.00039
   2.05396    1.15464    0.00038
   2.04938    1.11780    0.00037
   2.04662    1.08480    0.00037
   2.04501    1.05542    0.00036
   2.04410    1.02934    0.00035
   2.04357    1.00621    0.00035
   2.04323    0.98573    0.00034
   2.04293    0.96759    0.00034
   2.04259    0.95153    0.00033
   2.04216    0.93732    0.00033
   2.04161    0.92477    0.00032
   2.04092    0.91369    0.00032
Massed Presentations
   2.50000    0.80000        inf
   2.82707    1.70931   -0.00139
   2.80447    1.46832   -0.00189
   2.81617    1.27605   -0.00162
   2.81853    1.15835   -0.00154
   2.81909    1.08448   -0.00152
   2.82028    1.03651   -0.00152
   2.82209    1.00476   -0.00152
   2.82406    0.98374   -0.00152
   2.82579    0.97010   -0.00153
   2.82707    0.96169   -0.00154
   2.82785    0.95704   -0.00155
   2.82812    0.95514   -0.00156
   2.82795    0.95525   -0.00157
   2.82740    0.95681   -0.00157
   2.82655    0.95943   -0.00158
   2.82546    0.96279   -0.00159
   2.82420    0.96666   -0.00159
   2.82281    0.97087   -0.00160
   2.82135    0.97528   -0.00160
   2.81984    0.97978   -0.00161

In [36]:
# Draw every staircase result and remember (label, line handle, result) so the
# legend can show the best gamma/delta next to each label.
plots = []
for name, dresult in dresults.items():
    p, = dresult.plot()
    plots.append((name, p, dresult))

if len(plots) > 1:
    # Raw string: '\g' and '\d' are not valid escapes in a normal literal.
    # NOTE(review): delta is printed behind a hard-coded minus sign, which
    # assumes `best['delta']` is stored as a positive magnitude -- confirm.
    gamma_delta = r' ($\gamma = {0[gamma]:.3f}, \delta = -{0[delta]:.3f}$)'
    plt.legend([item[1] for item in plots],
               [n + gamma_delta.format(r.best) for n, p, r in plots])

7 Runs with Standard Deviation


In [5]:
max_size = 100000  # cap on the number of filtered rows kept per slice
slices = 7         # number of independent data slices (runs) per place type


def _staircase_filter(place_type):
    """Return a row filter combining `filters.place_type` for `place_type`
    with `filters.for_staircase`."""
    return lambda d: filters.place_type(d, place_type) & filters.for_staircase(d)


# (label, row filter, size multiplier). The multiplier scales both the number
# of rows requested per slice and the starting offset.
descents_10 = (
    ('Lakes', _staircase_filter('lake'), 4),
    ('Rivers', _staircase_filter('river'), 1),
    ('Mountains', _staircase_filter('mountains'), 1),
)

dresults_10 = defaultdict(list)

for name, filter_fun, mul in descents_10:
    tools.echo(name, clear=False)

    # Load and filter every slice first, then fit, so the progress messages
    # keep their "loaded ... / done" grouping.
    train_data = []
    for i in range(slices):
        # Integer row counts, matching the other load_data calls in this notebook.
        limit = 500000 * mul
        offset = i * 1000000 + 500000 + limit
        df = tools.load_data(limit=limit, offset=offset, echo_loaded=False)
        df = df[filter_fun(df)][:max_size]
        train_data.append(df.copy())
        tools.echo('[{}]: Loaded {} answers.'.format(i, len(df)), clear=False)

    tools.echo('Data loaded.', clear=False)

    for i in range(slices):
        descent = optimize.GradientDescent(train_data[i])
        res = descent.search_staircase(init_learn_rate=0.02, number_of_iter=15,
                                       echo_iterations=False)
        dresults_10[name].append(res)
        tools.echo('[{}]: done!'.format(i), clear=False)


Lakes
[0]: Loaded 2943 answers.
[1]: Loaded 3610 answers.
[2]: Loaded 4613 answers.
[3]: Loaded 4801 answers.
[4]: Loaded 3781 answers.
[5]: Loaded 4176 answers.
[6]: Loaded 4008 answers.
Data loaded.
[0]: done!
[1]: done!
[2]: done!
[3]: done!
[4]: done!
[5]: done!
[6]: done!
Rivers
[0]: Loaded 16243 answers.
[1]: Loaded 17666 answers.
[2]: Loaded 50751 answers.
[3]: Loaded 54152 answers.
[4]: Loaded 52914 answers.
[5]: Loaded 33936 answers.
[6]: Loaded 41800 answers.
Data loaded.
[0]: done!
[1]: done!
[2]: done!
[3]: done!
[4]: done!
[5]: done!
[6]: done!
Mountains
[0]: Loaded 21970 answers.
[1]: Loaded 18447 answers.
[2]: Loaded 44145 answers.
[3]: Loaded 35698 answers.
[4]: Loaded 38264 answers.
[5]: Loaded 24397 answers.
[6]: Loaded 44313 answers.
Data loaded.
[0]: done!
[1]: done!
[2]: done!
[3]: done!
[4]: done!
[5]: done!
[6]: done!

In [6]:
def get_gamma_delta(descent_results):
    """Summarise the final gamma and delta over several descent runs.

    Each element of `descent_results` must expose `gammas` and `deltas`
    sequences; only their last entries are used.

    Returns a dict with:
        'std': [std of final gammas, std of final deltas]
        'avg': [mean of final gammas, negated mean of final deltas]
    """
    final_gammas = [run.gammas[-1] for run in descent_results]
    final_deltas = [run.deltas[-1] for run in descent_results]

    spread = [np.std(final_gammas), np.std(final_deltas)]
    # The delta average is reported with its sign flipped.
    centre = [np.mean(final_gammas), -np.mean(final_deltas)]
    return {'std': spread, 'avg': centre}

In [7]:
def prepare_plot_data(descent_results):
    """Average the final staircase of several descent runs into plot series.

    For every run, x values are the mean of `staircase_times[interval]` for
    each entry of `res.intervals`, and y values are the values of the last
    staircase ordered by key.

    Returns `(x_axis, y_axis, e_vals)`: per-position mean x, mean y and
    standard deviation of y across runs. The number of positions is taken
    from the first run.

    NOTE(review): x follows the order of `res.intervals` while y follows the
    sorted staircase keys -- this assumes `res.intervals` is already sorted.
    """
    x_matrix, y_matrix = [], []
    for run in descent_results:
        last_staircase = run.staircases[-1]
        ordered_keys = sorted(last_staircase)
        times = run.model.metadata['staircase_times']

        x_matrix.append([np.mean(times[interval]) for interval in run.intervals])
        y_matrix.append([last_staircase[key] for key in ordered_keys])

    x_axis, y_axis, e_vals = [], [], []
    for col in range(len(x_matrix[0])):
        x_column = [xs[col] for xs in x_matrix]
        y_column = [ys[col] for ys in y_matrix]
        x_axis.append(np.mean(x_column))
        y_axis.append(np.mean(y_column))
        e_vals.append(np.std(y_column))

    return x_axis, y_axis, e_vals

In [40]:
# Compare the averaged staircase functions of all place types in one figure.
plots = []
labels = []

fig = plt.figure(num=None, figsize=(7, 4), dpi=120)
ax = plt.subplot(111)

lines = ['o-', 's-', '^-']
annotated_point = 5  # index of the data point that carries the gamma/delta label

for i, (name, results_10) in enumerate(dresults_10.items()):
    x_axis, y_axis, e_vals = prepare_plot_data(results_10)
    # Error bars are only drawn when a single series is shown.
    if len(dresults_10) == 1:
        ax.errorbar(x_axis, y_axis, e_vals,
                    ecolor='orange', elinewidth=2,
                    linestyle='--', linewidth=2,
                    capthick=2, capsize=4,
                    color='#02A5F4', marker='o')
    # Cycle through the marker styles instead of assuming exactly three.
    p, = ax.plot(x_axis, y_axis, lines[i % len(lines)], label=name)
    plots.append(p)
    labels.append(name)

    tools.echo(name, clear=False)
    tools.echo('x: {}'.format([round(x, 1) for x in x_axis]), clear=False)
    tools.echo('y: {}'.format([round(y, 3) for y in y_axis]), clear=False)

    gamma_delta = get_gamma_delta(results_10)
    std_msg = 'std: gamma={:.3f}, delta={:.3f}'
    avg_msg = 'avg: gamma={:.3f}, delta={:.3f}'
    tools.echo(std_msg.format(*gamma_delta['std']), clear=False)
    tools.echo(avg_msg.format(*gamma_delta['avg']), clear=False)

    # Clamp the index so shorter series are still annotated instead of
    # raising IndexError.
    point = min(annotated_point, len(x_axis) - 1)
    x_pos, y_pos = x_axis[point], y_axis[point]
    info = [round(x, 3) for x in gamma_delta['avg']]
    ax.annotate(r'$\gamma={0}$, $\delta={1}$'.format(*info),
                xy=(x_pos, y_pos), xycoords='data', size='small',
                xytext=(x_pos, y_pos + 0.15), textcoords='data',
                bbox=dict(boxstyle="round", fc="w", linewidth=1))

ax.set_xscale('log')
ax.set_xlabel('Time from previous attempt in seconds')
ax.set_ylabel('Increase in memory activation')
ax.set_xlim([25, 1e6])

# Shrink current axis by 20%
box = ax.get_position()
ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])

# Put a legend to the right of the current axis
legend = ax.legend(plots, labels, loc='center left', bbox_to_anchor=(1.05, 0.5),
                   prop={'size': 12})
legend.get_frame().set_linewidth(1)

# Finalise the layout before showing: with a blocking backend, anything after
# show() would be applied too late to affect the displayed figure.
plt.tight_layout()
plt.subplots_adjust(right=0.73)  # adjust for the legend to fit
plt.show()


Mountains
x: [40.7, 74.6, 115.8, 205.4, 417.3, 1005.1, 4431.1, 47609.9, 195244.5, 632093.2]
y: [1.108, 0.907, 0.795, 0.598, 0.38, 0.445, 0.458, 0.323, 0.025, -0.285]
std: gamma=0.221, delta=0.131
avg: gamma=2.108, delta=-0.709
Lakes
x: [41.9, 73.7, 115.8, 204.5, 417.9, 1019.2, 4499.1, 45133.6, 219802.8, 938256.9]
y: [1.176, 1.122, 1.114, 1.045, 0.975, 0.692, 0.462, 0.529, 0.249, 0.004]
std: gamma=0.200, delta=0.216
avg: gamma=2.699, delta=-0.618
Rivers
x: [40.7, 74.7, 115.7, 205.6, 415.1, 1023.5, 4635.0, 48792.6, 198540.1, 621875.5]
y: [1.001, 0.746, 0.609, 0.481, 0.387, 0.38, 0.272, 0.166, 0.153, -0.164]
std: gamma=0.216, delta=0.132
avg: gamma=2.554, delta=-0.766

In [21]:
# Gradient-descent optimiser over the unfiltered `data` frame. This rebinds
# `descent`, which earlier cells also use as a loop variable.
descent = optimize.GradientDescent(data)

In [22]:
# A single search iteration from gamma=-1, delta=1; the model metadata stored
# on the result (`gammas`, `deltas`, `rmse`) is what the next cell plots.
r = descent.search_staircase(init_gamma=-1, init_delta=1, number_of_iter=1)


  -1.00000    1.00000        inf
   3.74150   -1.08690    0.04555

In [35]:
# Left panel: gamma and delta as recorded in the model metadata.
# Right panel: the recorded RMSE points.
frame, (ax1, ax2) = plt.subplots(1, 2, figsize=(7, 3), dpi=160)

ax1.plot(r.model.metadata['gammas'], '-',
         r.model.metadata['deltas'], '-')
ax1.set_xlabel('Number of answers')
# Raw string: '\g' and '\d' are not valid escapes in a normal literal.
ax1.set_ylabel(r'Value of $\gamma$ and $\delta$')

# Each 'rmse' entry is indexed as a pair: element 0 goes on x, element 1 on y.
ax2.plot([x[0] for x in r.model.metadata['rmse']],
         [x[1] for x in r.model.metadata['rmse']],
         'g-^')
ax2.set_xlabel('Number of answers')
ax2.set_ylabel('RMSE')

# Curve labels, placed at hard-coded data coordinates near the right edge.
ax1.text(9000, 3, r'$\gamma$', fontsize=16, color='#02A5F4')
ax1.text(9000, -0.8, r'$\delta$', fontsize=16, color='orange')

plt.tight_layout()

In [ ]: